In [1]:
import sys

# Make the local tethne checkout importable. The membership guard keeps the
# cell idempotent: re-running it no longer appends a duplicate entry to
# sys.path each time.
tethne_src = '/Users/erickpeirson/tethne'
if tethne_src not in sys.path:
    sys.path.append(tethne_src)
In [2]:
import matplotlib.pyplot as plt
In [3]:
from tethne.readers import dfr
In [4]:
# JSTOR Data-for-Research dataset directories to load.
# Each directory appears exactly once: every entry is passed to dfr.read() and
# dfr.ngrams() below, so a repeated entry loads that dataset's papers twice.
# NOTE(review): if a third, distinct dataset was meant to be included, add its
# directory here.
datapath = [
    '/Users/erickpeirson/Desktop/cleanup/JStor DfR Datasets/2013.5.3.cHrmED8A',
    '/Users/erickpeirson/Desktop/cleanup/JStor DfR Datasets/2013.5.3.k2HUvXh9',
]
In [5]:
# Output and scratch directories (judging by the directory names, for an LDA
# run). NOTE(review): neither name is referenced by any later cell visible
# here -- confirm they are still needed.
outpath = '/Users/erickpeirson/tethne/doc/notebooks/sandbox/ldaout'
temppath = '/Users/erickpeirson/tethne/doc/notebooks/sandbox/ldatemp'
In [6]:
# Read every dataset directory in order and concatenate the results into one
# flat list of papers.
papers = []
for path in datapath:
    for p in dfr.read(path):
        papers.append(p)
In [7]:
len(papers)
Out[7]:
In [8]:
# Merge the unigram ('uni') data of every dataset into a single dict.
# dict.update() means that a key present in more than one dataset keeps the
# value from the dataset listed last.
wordcounts = {}
for path in datapath:
    wordcounts.update(dfr.ngrams(path, 'uni'))
In [9]:
from nltk.corpus import stopwords
In [10]:
# Stopword list later handed to Corpus(..., exclude=stoplist).
# NOTE(review): words() is called without a language/fileid; per the NLTK
# corpus-reader docs that should return the stopwords of every bundled
# language, not just English -- confirm this is intended rather than
# stopwords.words('english').
stoplist = stopwords.words()
In [11]:
from tethne import Corpus
In [12]:
# Build the corpus from the loaded papers, attaching the merged unigram data
# as the 'wordcounts' feature set and indexing papers by 'doi'. The stoplist
# is passed as `exclude` (presumably tokens to drop from the features --
# confirm against the Corpus API).
D = Corpus(papers, features={'wordcounts': wordcounts}, index_by='doi', exclude=stoplist)
In [13]:
def filt(s, C, DC, min_count=3, min_docs=1, min_length=3):
    """
    Decide whether a feature passes the filter used with ``filter_features``.

    A feature is kept only when all three values are strictly greater than
    their thresholds. The defaults reproduce the previously hard-coded
    limits, so ``filt(s, C, DC)`` behaves as before.

    Parameters
    ----------
    s : sized object (e.g. str)
        The feature itself; only ``len(s)`` is inspected.
    C : number
        Compared against ``min_count``. Presumably the feature's overall
        count in the corpus -- confirm against ``Corpus.filter_features``.
    DC : number
        Compared against ``min_docs``. Presumably the number of documents
        containing the feature -- confirm against ``Corpus.filter_features``.
    min_count : number, optional
        ``C`` must exceed this value (default 3).
    min_docs : number, optional
        ``DC`` must exceed this value (default 1).
    min_length : int, optional
        ``len(s)`` must exceed this value (default 3).

    Returns
    -------
    bool
        True if the feature should be kept, False otherwise.
    """
    # bool() guarantees a plain True/False even if the inputs are numpy scalars.
    return bool(C > min_count and DC > min_docs and len(s) > min_length)
In [14]:
D.filter_features('wordcounts', 'wordcounts_filtered', filt)
In [15]:
len(D.features['wordcounts']['index']), len(D.features['wordcounts_filtered']['index'])
Out[15]:
In [16]:
D.slice('date', method='time_period', window_size=5)
In [17]:
D.plot_distribution('date')
In [18]:
D.slice('jtitle')
In [21]:
D.plot_distribution('date', 'jtitle', aspect=0.1, interpolation='none')
In [23]:
D.get_by([('date',1946), ('date',1951)], include_papers=False)
In [19]:
from tethne.model.managers import DTMModelManager
In [20]:
dtm_path = '/Users/erickpeirson/tethne/tethne/model/bin/main'
In [21]:
dtm_outpath = '/Users/erickpeirson/tethne/doc/notebooks/sandbox/dtmout'
dtm_temppath = '/Users/erickpeirson/tethne/doc/notebooks/sandbox/dtmtemp'
In [22]:
# Configure the DTM model manager on the filtered word-count feature set,
# pointing it at the output directory, the scratch directory and the DTM
# binary defined in the cells above.
DM = DTMModelManager(
    D,
    feature='wordcounts_filtered',
    outpath=dtm_outpath,
    temppath=dtm_temppath,
    dtm_path=dtm_path,
)
In [23]:
DM.prep()
In [24]:
DM.build()
Out[24]:
In [25]:
# cPickle only exists on Python 2. On Python 3 the plain ``pickle`` module
# already uses the C implementation, so fall back to it there.
try:
    import cPickle as pickle
except ImportError:
    import pickle
In [26]:
import os

# Persist the built model manager inside the DTM output directory. The path
# is derived from dtm_outpath instead of repeating the directory literal, so
# the two cannot drift apart; with the current dtm_outpath it resolves to the
# same file as before.
dtm_pickle_path = os.path.join(dtm_outpath, 'DTMModelManager.pickle')
with open(dtm_pickle_path, 'wb') as f:
    pickle.dump(DM, f)
In [28]:
DM.list_topic_diachronic(1)
Out[28]:
In [30]:
import networkx as nx
In [43]:
g = nx.Graph(name='my graph')
In [47]:
g.add_edge(1,3, weight=0.5)
In [49]:
g.add_node(1, size=0.3)
In [50]:
g.__dict__
Out[50]:
In [51]:
from scipy.sparse import coo_matrix
In [191]:
# Coordinate-format triplets for the sparse matrix built in the next cell via
# coo_matrix((K, (I, J))): entry n has value K[n] at row I[n], column J[n].
I, J, K = (
    [0, 1, 2, 3, 3],
    [1, 1, 3, 0, 1],
    [1, 2, 3, 4, 5],
)
In [192]:
A = coo_matrix((K, (I, J)))
In [211]:
# List the (row, column) coordinates of the nonzero entries of A.
# nonzero() is evaluated once and unpacked, rather than being recomputed per
# axis, and list() makes the pairs display on Python 3 as well, where a bare
# zip() is a lazy iterator.
rows, cols = A.nonzero()
list(zip(rows, cols))
Out[211]:
In [194]:
B = A.tocsr()
In [195]:
B[0,1]
Out[195]:
In [196]:
C = A.tolil()
In [197]:
A.nonzero()[0]
Out[197]:
In [198]:
C.nonzero()
Out[198]:
In [203]:
# Indices connected to index 0 in either direction: columns holding a nonzero
# in row 0, unioned with rows holding a nonzero in column 0.
row0_cols = set(B[0, :].nonzero()[1])
col0_rows = set(B[:, 0].nonzero()[0])
list(row0_cols | col0_rows)
Out[203]:
In [200]:
A.nonzero()
Out[200]:
In [201]:
class SA(object):
    """Scratch class showing how a two-part subscript reaches ``__getitem__``."""

    def __getitem__(self, indices):
        """
        Print the two components of a subscript such as ``SA()[0, 1]``.

        Parameters
        ----------
        indices : iterable of exactly two items
            The subscript as received from ``obj[i, j]`` (a 2-tuple).

        Returns
        -------
        None

        Raises
        ------
        TypeError, ValueError
            If ``indices`` cannot be unpacked into exactly two values.
        """
        i, j = indices
        # Printing one pre-formatted string produces "i j" under both the
        # Python 2 print statement and the Python 3 print function.
        print('%s %s' % (i, j))
In [202]:
SA()[0,1]
In [214]:
g.edges(data=True)
Out[214]:
In [216]:
A.data
Out[216]:
In [218]:
# Inspect the raw adjacency mapping (node -> neighbour -> edge attributes).
# `g.adj` is used instead of `g.edge`: in NetworkX 1.x the two refer to the
# same mapping, and `Graph.edge` was removed in NetworkX 2.0.
g.adj
Out[218]:
In [219]:
from tethne.persistence.hdf5.graphcollection import HDF5Graph
In [ ]: